package utils;
import java.io.BufferedReader;
import java.io.EOFException;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.ObjectOutput;
import java.io.ObjectOutputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.StringTokenizer;

/**
 * Generates a serialised Map containing a shortened version of the British
 * National Corpus, holding only the words which occur more than 100 times in
 * the corpus. It is not necessary to alter this file to complete AIPJ
 * assignment 1.
 *
 * <p> This class may come in
 * useful for MSc projects. </p>
 * 
 * @author Judy Robertson
 */
public class GenerateShortBNC {

    /** Name of the plain-text BNC word frequency list that is read in. */
    private static final String INPUT_FILE_NAME = "all_num_o5.txt";

    /** Name of the file the serialised map is written to. */
    private static final String OUTPUT_FILE_NAME = "bncobjects.dat";

    /** A word is kept only if its frequency is strictly greater than this. */
    private static final int MIN_FREQUENCY = 100;

    /**
     * A map data structure for relating words with their frequency in the
     * BNC. Keys are lower-cased words (String); values are the matching
     * BNCWord objects.
     */
    public Map bnc;

    /**
     * Constructs a new GenerateShortBNC object and initialises the data
     * structure
     */
    public GenerateShortBNC() {
        bnc = new HashMap();
    }

    /**
     * Reads in the text file containing the BNC information, filters out
     * infrequent words and then writes out a map containing BNC words.
     *
     * @param args No arguments needed
     */
    public static void main(String args[]) {
        GenerateShortBNC g = new GenerateShortBNC();
        g.readinBNC();
        g.writeBNC();
    }

    /**
     * Builds the location of a file inside the
     * <code>wordnet/data</code> directory under the user's home directory.
     *
     * @param fileName simple name of the file
     * @return the file <code>~/wordnet/data/fileName</code>
     */
    private static File dataFile(String fileName) {
        return new File(System.getProperty("user.home") + File.separator
                + "wordnet" + File.separator + "data" + File.separator + fileName);
    }

    /**
     * Opens the BNC word frequencies file and fills {@link #bnc}. The file
     * is formatted as space-separated lines of:
     * 1: frequency
     * 2: word
     * 3: pos
     * 4: number of files the word occurs in
     * Only the word and frequency are required here.
     *
     * <p>Lines are processed as they are read, so the file is never held
     * in memory as a whole. An I/O error is reported on standard error and
     * reading stops; entries stored before the error are kept. The reader
     * is always closed.</p>
     */
    private void readinBNC() {
        BufferedReader in = null;
        try {
            in = new BufferedReader(new FileReader(dataFile(INPUT_FILE_NAME)));

            String line;
            while ((line = in.readLine()) != null) {
                addLine(line);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Parses one line of the frequency file and, if the word is frequent
     * enough, stores it in {@link #bnc}.
     *
     * <p>Lines with fewer than two tokens (e.g. blank lines) are ignored, so
     * a word or frequency left over from an earlier line can never be
     * stored by mistake. Lines whose first token is not an integer are
     * reported on standard error and skipped rather than aborting the
     * run. If the same lower-cased word occurs on several lines, the entry
     * from the last such line read replaces the earlier ones.</p>
     *
     * @param line one line of the frequency file
     */
    private void addLine(String line) {
        StringTokenizer tokens = new StringTokenizer(line, " ");
        if (tokens.countTokens() < 2) {
            return;
        }

        int frequency;
        try {
            frequency = Integer.parseInt(tokens.nextToken());
        } catch (NumberFormatException e) {
            System.err.println("Skipping malformed BNC line: " + line);
            return;
        }
        if (frequency <= MIN_FREQUENCY) {
            return;
        }

        // Fixed locale: the default locale may fold case differently
        // (e.g. Turkish maps "I" to a dotless i), which would corrupt keys.
        String word = tokens.nextToken().toLowerCase(java.util.Locale.ENGLISH);

        BNCWord entry = new BNCWord();
        entry.setWord(word);
        entry.setFrequency(frequency);
        bnc.put(word, entry);
    }

    /**
     * This writes out the BNC words as serialised objects: the whole
     * {@link #bnc} map is written as a single object to
     * <code>bncobjects.dat</code> in the data directory. An I/O error is
     * reported on standard error. The output stream is always closed.
     */
    public void writeBNC() {
        FileOutputStream fstrm = null;
        ObjectOutput ostrm = null;
        try {
            fstrm = new FileOutputStream(dataFile(OUTPUT_FILE_NAME));
            ostrm = new ObjectOutputStream(fstrm);

            ostrm.writeObject(bnc);
            ostrm.flush();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (ostrm != null) {
                    // Closing the object stream also closes the file stream.
                    ostrm.close();
                } else if (fstrm != null) {
                    // The object stream was never created; close the file directly.
                    fstrm.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

}
